from passengers import Passengers, Line
# Build one Line per railway line, keyed by the line's name.
#
# Each entry below is (workbook path, line name, kept row).  Rows before the
# kept row and the three rows right after it are listed in skip_rows; the
# kept row itself is presumably the header row of the sheet -- TODO confirm
# against Line.  Columns 1..99 are requested for every workbook.
lines = {}
for file_name, line, kept_row in (
        ('data/t091307.xlsx', '田園都市線', 11),
        ('data/t091306.xlsx', '東横線', 12),
        ('data/t091305.xlsx', '京浜急行線', 12),
        ('data/t091304.xlsx', '相模鉄道線', 12),
        ('data/t091303.xlsx', 'みなとみらい線', 11),
        ('data/t091302.xlsx', '金沢シーサイドライン', 11),
        ('data/t091203.xlsx', 'グリーンライン', 11),
        ('data/t091202.xlsx', 'ブルーライン', 11),
):
    # Fresh lists per line so no two Line objects share the same list.
    skip_rows = (list(range(0, kept_row))
                 + list(range(kept_row + 1, kept_row + 4)))
    use_cols = list(range(1, 100))
    lines[line] = Line(file_name, line, skip_rows, use_cols)
# Feed every configured line into a single Passengers object, then preview
# the first rows of the combined table.
ps = Passengers()
for line in lines.values():
    ps.read_csv(line=line)

display(ps.passengers.head())
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans, KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
# Remove every column whose name matches the "総数" (total) pattern.
# The matching names are collected first and dropped in one call, so the
# table is copied once instead of once per removed column, and a name that
# occurs more than once cannot be dropped twice.
total_columns = [
    column for column in ps.passengers.columns
    if re.match(".*総数.*", column)
]
ps.passengers = ps.passengers.drop(total_columns, axis=1)
import logging
# Configure the root logger with the default handler and keep a module-level
# handle to it.  The threshold is WARNING, so the info-level messages emitted
# while plotting stay silent unless this level is lowered.
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
import itertools
import calendar
from collections import Counter
class Visualizer(object):
    """Draw the columns of a DataFrame as scatter or line series on one figure.

    The first time a column name is drawn it is given a marker/colour pair
    taken from two endless cycles; the pair is remembered on the instance so
    the same name is drawn the same way on every later call.
    """

    def __init__(self):
        self._markers = itertools.cycle([
            'o', 'v', '^', '<', '>', '1', '2', '3', '4', '8',
            's', 'p', '*', 'h', 'H', '+', 'x', 'D', 'd', 'P'])
        self._colors = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y'])
        # Column name -> {'marker': ..., 'color': ...}.
        self._name_to_marker = {}

    @staticmethod
    def _series_label(name, cnt):
        """Return the legend label for the series called *name*.

        When *cnt* is a non-empty mapping that contains *name*, the label is
        ``'<name>: <cnt[name]>'``; otherwise it is *name* itself, so a series
        without an entry in *cnt* is still drawn instead of raising KeyError.
        """
        if cnt and name in cnt:
            return '{}: {}'.format(name, cnt[name])
        return name

    def _make_xticks_and_xlabels(self, xs_ticks, xtick_step, xs_labels):
        """Select the x ticks (and their labels) that fall on a step boundary.

        Args:
            xs_ticks: Tick positions, paired element-wise with *xs_labels*.
            xtick_step: ``'h'`` keeps entries whose minute is 0, ``'d'`` those
                whose minute and hour are 0, ``'m'`` those whose minute and
                hour are 0 and whose day is 1.  Any other value keeps nothing.
            xs_labels: Objects exposing ``minute``, ``hour``, ``day`` and
                ``weekday()`` (e.g. datetime-like timestamps).

        Returns:
            A ``(ticks, labels)`` pair of lists; every label is
            ``'<str(label)> (<weekday abbreviation>)'``.
        """
        shown_xticks = []
        shown_xlabels = []
        for xtick, xlabel in zip(xs_ticks, xs_labels):
            minute_zero = xlabel.minute == 0
            hour_zero = xlabel.hour == 0
            first_day = xlabel.day == 1
            on_boundary = {
                'h': minute_zero,
                'd': minute_zero and hour_zero,
                'm': minute_zero and hour_zero and first_day,
            }
            if not on_boundary.get(xtick_step, False):
                continue
            shown_xticks.append(xtick)
            shown_xlabels.append(
                '{} ({})'.format(
                    str(xlabel), calendar.day_abbr[xlabel.weekday()]))
            logger.info('Adding xtick, xlabel: %s, %s', xtick, xlabel)
        return shown_xticks, shown_xlabels

    def draw_graphs(self, df, cnt=None, scatter=True,
                    xlabel='Time', xtick_step='m',
                    ylabel='Power Consumption', ylog=False,
                    figsize=(12, 8), dotsize=3, alpha=0.3,
                    xlim=False):
        """Plot every column of *df* against its index on a new figure.

        Args:
            df: DataFrame whose index supplies the x values and whose columns
                are the series to draw.
            cnt: Optional mapping from column name to extra information that
                is appended to that column's legend label.
            scatter: Draw with ``plt.scatter`` when true, ``plt.plot``
                otherwise.
            xlabel: Text of the x-axis label.
            xtick_step: Tick granularity, see ``_make_xticks_and_xlabels``.
            ylabel: Text of the y-axis label.  The default is
                'Power Consumption'; pass an explicit value when plotting any
                other quantity.
            ylog: Use a logarithmic y axis when true.
            figsize: Figure size handed to ``plt.subplots``.
            dotsize: Marker size (``s`` for scatter, ``markersize`` for plot).
            alpha: Opacity of the drawn series.
            xlim: x-axis limits handed to ``plt.xlim``; skipped when falsy.
        """
        plt.subplots(1, figsize=figsize)
        if ylog:
            plt.yscale('log')

        xs_labels = df.index
        for ys_col in df.columns.tolist():
            ys = df[ys_col]
            # Assign a marker/colour pair on first use only, so the cycles
            # advance once per new series name.
            if ys_col not in self._name_to_marker:
                self._name_to_marker[ys_col] = {
                    'marker': next(self._markers),
                    'color': next(self._colors)}
            style = self._name_to_marker[ys_col]
            ys_label = self._series_label(ys_col, cnt)

            if scatter:
                plt.scatter(
                    xs_labels, ys,
                    marker=style['marker'],
                    c=style['color'],
                    s=dotsize, alpha=alpha,
                    label=ys_label)
            else:
                plt.plot(
                    xs_labels, ys,
                    marker=style['marker'],
                    c=style['color'],
                    markersize=dotsize, alpha=alpha,
                    label=ys_label)
            logger.info('Plotting graph: %s', ys_label)

        # The series are drawn against the index values, so those same values
        # serve as the tick positions.
        shown_xticks, shown_xlabels = self._make_xticks_and_xlabels(
            xs_labels, xtick_step, xs_labels)
        plt.xticks(shown_xticks, shown_xlabels, rotation=90)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        if xlim:
            plt.xlim(xlim)
        plt.legend(loc='upper left', ncol=2, markerscale=3)
        plt.tight_layout()
        # NOTE(review): an argument-less plot() call adds no series; the
        # figure presumably appears through the notebook backend.  Confirm
        # whether plt.show() was intended.
        plt.plot()
# Create the Visualizer used for the plots below.
viz = Visualizer()

# Treat every column of the passenger table as one time series and rescale
# each series to mean 0 / standard deviation 1 before clustering.
yss = to_time_series_dataset(ps.passengers.values.T)
yss = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(yss)

n_clusters = 8
n_init = 3
rand_seed = 13
ks = KShape(n_clusters=n_clusters,
            n_init=n_init,
            verbose=True,
            random_state=rand_seed)
ks.fit(yss)

# One column per cluster centre (first feature dimension of each centre),
# indexed like the passenger table.
columns = ['cluster-{}'.format(idx) for idx in range(n_clusters)]
clusters = np.asarray(ks.cluster_centers_)[:, :, 0].T
df_clusters = pd.DataFrame(clusters, columns=columns)
df_clusters.index = ps.passengers.index

# Count the series assigned to each cluster.  Every cluster id gets an entry
# (0 when nothing was assigned) so the legend lookup never misses a key.
cnt = Counter(ks.labels_)
cluster_labels = {
    'cluster-{}'.format(k): cnt[k] for k in range(n_clusters)
}

# NOTE(review): draw_graphs labels the y axis 'Power Consumption' by default;
# consider passing an explicit ylabel for this data.
# Plot all cluster centres together.
viz.draw_graphs(df_clusters, cnt=cluster_labels,
                xtick_step='m',
                alpha=0.4)

# Plot each cluster centre on its own figure as a line.
for i in range(n_clusters):
    target_clusters = ['cluster-{}'.format(i)]
    filtered_df_clusters = df_clusters[target_clusters]
    viz.draw_graphs(filtered_df_clusters,
                    cnt=cluster_labels,
                    xtick_step='m',
                    scatter=False,
                    alpha=0.4)

# List the column names that were assigned to each cluster.
for i in range(n_clusters):
    print("cluster " + str(i))
    members = [
        name
        for name, label in zip(ps.passengers.columns, ks.labels_)
        if label == i
    ]
    print(members)
# Cluster the same dataset with time-series k-means (Euclidean metric).
# yss, n_clusters, n_init and rand_seed are the values set for the k-Shape
# run earlier in the script.
clustering_metric = 'euclidean'
km = TimeSeriesKMeans(
    n_init=n_init,
    n_clusters=n_clusters,
    metric=clustering_metric,
    random_state=rand_seed)
km.fit(yss)

# One column per cluster centre (first feature dimension of each centre),
# indexed like the passenger table.
columns = ['cluster-{}'.format(idx) for idx in range(n_clusters)]
clusters = np.asarray(km.cluster_centers_)[:, :, 0].T
df_clusters = pd.DataFrame(clusters, columns=columns)
df_clusters.index = ps.passengers.index

# Count the series assigned to each cluster.  Every cluster id gets an entry
# (0 when nothing was assigned) so the legend lookup never misses a key.
cnt = Counter(km.labels_)
cluster_labels = {
    'cluster-{}'.format(k): cnt[k] for k in range(n_clusters)
}

# Plot all cluster centres together.
viz.draw_graphs(df_clusters, cnt=cluster_labels,
                xtick_step='m',
                alpha=0.4)

# Plot each cluster centre on its own figure as a line.
for i in range(n_clusters):
    target_clusters = ['cluster-{}'.format(i)]
    filtered_df_clusters = df_clusters[target_clusters]
    viz.draw_graphs(filtered_df_clusters,
                    cnt=cluster_labels,
                    xtick_step='m',
                    scatter=False,
                    alpha=0.4)

# List the column names that were assigned to each cluster.
for i in range(n_clusters):
    print("cluster " + str(i))
    members = [
        name
        for name, label in zip(ps.passengers.columns, km.labels_)
        if label == i
    ]
    print(members)
# Repeat the k-Shape clustering on the raw values: the dataset is rebuilt
# from the passenger table and, unlike the first run, is NOT passed through
# TimeSeriesScalerMeanVariance.
# NOTE(review): confirm that feeding unscaled series to k-Shape is intended.
yss = to_time_series_dataset(ps.passengers.values.T)

n_clusters = 8
n_init = 3
rand_seed = 13
ks = KShape(n_clusters=n_clusters,
            n_init=n_init,
            verbose=True,
            random_state=rand_seed)
ks.fit(yss)

# One column per cluster centre (first feature dimension of each centre),
# indexed like the passenger table.
columns = ['cluster-{}'.format(idx) for idx in range(n_clusters)]
clusters = np.asarray(ks.cluster_centers_)[:, :, 0].T
df_clusters = pd.DataFrame(clusters, columns=columns)
df_clusters.index = ps.passengers.index

# Count the series assigned to each cluster.  Every cluster id gets an entry
# (0 when nothing was assigned) so the legend lookup never misses a key.
cnt = Counter(ks.labels_)
cluster_labels = {
    'cluster-{}'.format(k): cnt[k] for k in range(n_clusters)
}

# Plot all cluster centres together.
viz.draw_graphs(df_clusters, cnt=cluster_labels,
                xtick_step='m',
                alpha=0.4)

# Plot each cluster centre on its own figure as a line.
for i in range(n_clusters):
    target_clusters = ['cluster-{}'.format(i)]
    filtered_df_clusters = df_clusters[target_clusters]
    viz.draw_graphs(filtered_df_clusters,
                    cnt=cluster_labels,
                    xtick_step='m',
                    scatter=False,
                    alpha=0.4)

# List the column names that were assigned to each cluster.
for i in range(n_clusters):
    print("cluster " + str(i))
    members = [
        name
        for name, label in zip(ps.passengers.columns, ks.labels_)
        if label == i
    ]
    print(members)
# Cluster the current yss (rebuilt just above without rescaling) with
# time-series k-means using the Euclidean metric.  n_clusters, n_init and
# rand_seed are the values set for the preceding k-Shape run.
clustering_metric = 'euclidean'
km = TimeSeriesKMeans(
    n_init=n_init,
    n_clusters=n_clusters,
    metric=clustering_metric,
    random_state=rand_seed)
km.fit(yss)

# One column per cluster centre (first feature dimension of each centre),
# indexed like the passenger table.
columns = ['cluster-{}'.format(idx) for idx in range(n_clusters)]
clusters = np.asarray(km.cluster_centers_)[:, :, 0].T
df_clusters = pd.DataFrame(clusters, columns=columns)
df_clusters.index = ps.passengers.index

# Count the series assigned to each cluster.  Every cluster id gets an entry
# (0 when nothing was assigned) so the legend lookup never misses a key.
cnt = Counter(km.labels_)
cluster_labels = {
    'cluster-{}'.format(k): cnt[k] for k in range(n_clusters)
}

# Plot all cluster centres together.
viz.draw_graphs(df_clusters, cnt=cluster_labels,
                xtick_step='m',
                alpha=0.4)

# Plot each cluster centre on its own figure as a line.
for i in range(n_clusters):
    target_clusters = ['cluster-{}'.format(i)]
    filtered_df_clusters = df_clusters[target_clusters]
    viz.draw_graphs(filtered_df_clusters,
                    cnt=cluster_labels,
                    xtick_step='m',
                    scatter=False,
                    alpha=0.4)

# List the column names that were assigned to each cluster.
for i in range(n_clusters):
    print("cluster " + str(i))
    members = [
        name
        for name, label in zip(ps.passengers.columns, km.labels_)
        if label == i
    ]
    print(members)